HTMLJsoupCleanerImpl.java example

Explorer
Tanaguru-master
/*
 * Tanaguru - Automated webpage assessment
 * Copyright (C) 2008-2015  Tanaguru.org
 *
 * This file is part of Tanaguru.
 *
 * Tanaguru is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact us by mail: tanaguru AT tanaguru DOT org
 */
package org.tanaguru.contentadapter.html;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities;
import org.jsoup.nodes.Node;
import org.tanaguru.contentadapter.HTMLCleaner;

/**
 * {@link HTMLCleaner} implementation that relies on Jsoup to re-serialize a
 * page: namespace artifacts are stripped from the raw source, the source is
 * parsed with Jsoup, comments and malformed attributes are removed from the
 * resulting tree, and the tree is written back out as indented markup.
 *
 * @author jkowalczyk
 */
public class HTMLJsoupCleanerImpl extends AbstractHTMLCleaner implements HTMLCleaner {

    /** Matches an empty default namespace declaration such as {@code xmlns=""}. */
    private static final Pattern EMPTY_NS_DEFINITION_PATTERN =
            Pattern.compile("xmlns=\"(\\s)*\"");
    /** Matches the start of an opening tag carrying a generated prefix, e.g. {@code <a0:}. */
    private static final Pattern NS_TAG_OPEN_PREFIX_DEFINITION_PATTERN =
            Pattern.compile("<a[0-9]+:");
    /** Matches the start of a closing tag carrying a generated prefix, e.g. {@code </a0:}. */
    private static final Pattern NS_TAG_CLOSURE_PREFIX_DEFINITION_PATTERN =
            Pattern.compile("</a[0-9]+:");
    static final String CORRECTOR_NAME = "JsoupCleaner";

    public HTMLJsoupCleanerImpl() {
        super();
    }

    /**
     * Cleans {@code dirtyHTML} and stores the serialized document in
     * {@code result}. Neither field is declared in this class; presumably
     * both are inherited from {@code AbstractHTMLCleaner} - verify there.
     */
    @Override
    public void run() {
        // The namespace clean-up has to happen on the raw text, before parsing.
        dirtyHTML = removeBadNamespaceDefinition(dirtyHTML);
        Document doc = Jsoup.parse(dirtyHTML);
        doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
        doc.outputSettings().outline(true);
        doc.outputSettings().indentAmount(2);
        removeComments(doc);
        removeMalformedAttributes(doc);
        result = doc.outerHtml();
    }

    /**
     * Recursively removes every comment node found below the given node.
     *
     * @param node the node whose descendants are scanned
     */
    private void removeComments(Node node) {
        // Children are removed while walking the list, so an index-based loop
        // is used instead of a foreach: the index only advances when the
        // current child is kept.
        int i = 0;
        while (i < node.childNodes().size()) {
            Node child = node.childNode(i);
            if ("#comment".equals(child.nodeName())) {
                child.remove();
            } else {
                removeComments(child);
                i++;
            }
        }
    }

    /**
     * Recursively removes, from every descendant of the given node, the
     * attributes whose key both starts and ends with a double quote.
     *
     * @param node the node whose descendants are scanned
     */
    private void removeMalformedAttributes(Node node) {
        for (Node child : node.childNodes()) {
            // Collect first, remove afterwards: removing an attribute while
            // iterating over child.attributes() mutates the collection being
            // traversed, which may skip entries or fail depending on the
            // Jsoup version.
            List<String> malformedKeys = new ArrayList<String>();
            for (Attribute attr : child.attributes()) {
                String key = attr.getKey();
                if (key.startsWith("\"") && key.endsWith("\"")) {
                    malformedKeys.add(key);
                }
            }
            for (String key : malformedKeys) {
                child.removeAttr(key);
            }
            removeMalformedAttributes(child);
        }
    }

    /**
     * Webdriver may return some html with namespace prefixed tags. This method
     * cleans the source code so that it can be parsed with Jsoup: empty
     * {@code xmlns} declarations are dropped and generated tag prefixes
     * (an {@code a} followed by digits and a colon) are stripped from opening
     * and closing tags.
     *
     * @param dirtyHTML the raw source to clean
     * @return the source without the namespace artifacts
     */
    private String removeBadNamespaceDefinition(String dirtyHTML) {
        String cleaned = EMPTY_NS_DEFINITION_PATTERN.matcher(dirtyHTML).replaceAll("");
        cleaned = NS_TAG_OPEN_PREFIX_DEFINITION_PATTERN.matcher(cleaned).replaceAll("<");
        return NS_TAG_CLOSURE_PREFIX_DEFINITION_PATTERN.matcher(cleaned).replaceAll("</");
    }

    /**
     * @return the name identifying this cleaner, {@link #CORRECTOR_NAME}
     */
    @Override
    public String getCorrectorName() {
        return CORRECTOR_NAME;
    }

}